Steps in Data Preprocessing. These are the steps:
Import dataset
Checking for missing values
Encoding categorical data
Data splitting
Feature Scaling
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Documentation
import handcalcs.render
# Plot
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm # color map
import seaborn as sns
import plotly.express as px
from sympy import Sum, symbols, Indexed, lambdify, diff
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from mpl_toolkits.mplot3d.axes3d import Axes3D
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
# Directory that holds the raw input files
data_path = './Data/'
# Read wfp_food_prices_nga.csv. low_memory=False has pandas parse the file in
# one pass instead of chunk by chunk, so each column gets a single,
# consistently inferred dtype.
data = pd.read_csv(
    data_path + "wfp_food_prices_nga.csv",
    low_memory=False,
)
# Give the rows a fresh 0..n-1 index, then show (rows, columns).
data = data.reset_index(drop=True)
data.shape
(80982, 14)
data
| date | admin1 | admin2 | market | latitude | longitude | category | commodity | unit | priceflag | pricetype | currency | price | usdprice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | #date | #adm1+name | #adm2+name | #loc+market+name | #geo+lat | #geo+lon | #item+type | #item+name | #item+unit | #item+price+flag | #item+price+type | #currency | #value | #value+usd |
| 1 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | cereals and tubers | Maize | KG | actual | Wholesale | NGN | 175.92 | 1.5525 |
| 2 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | cereals and tubers | Millet | KG | actual | Wholesale | NGN | 150.18 | 1.3254 |
| 3 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | cereals and tubers | Rice (imported) | KG | actual | Wholesale | NGN | 358.7 | 3.1656 |
| 4 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | cereals and tubers | Sorghum | KG | actual | Wholesale | NGN | 155.61 | 1.3733 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 80977 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Onions | 400 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 80978 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Oranges | 400 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 80979 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Spinach | 300 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 80980 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Tomatoes | 0.5 KG | forecast | Retail | NGN | 0.0 | 0.0 |
| 80981 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Watermelons | 2.1 KG | forecast | Retail | NGN | 0.0 | 0.0 |
80982 rows × 14 columns
# The first row of the file is not an observation: it holds one '#...' tag
# string per column (e.g. '#date', '#adm1+name'). Keep it for reference.
column_info = data.iloc[0, :]
column_info
date #date admin1 #adm1+name admin2 #adm2+name market #loc+market+name latitude #geo+lat longitude #geo+lon category #item+type commodity #item+name unit #item+unit priceflag #item+price+flag pricetype #item+price+type currency #currency price #value usdprice #value+usd Name: 0, dtype: object
# Remove the row with index label 0 (the row of '#...' tag strings), then
# renumber the remaining rows from 0.
without_tag_row = data.drop(index=0)
data = without_tag_row.reset_index(drop=True)
data
| date | admin1 | admin2 | market | latitude | longitude | category | commodity | unit | priceflag | pricetype | currency | price | usdprice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | cereals and tubers | Maize | KG | actual | Wholesale | NGN | 175.92 | 1.5525 |
| 1 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | cereals and tubers | Millet | KG | actual | Wholesale | NGN | 150.18 | 1.3254 |
| 2 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | cereals and tubers | Rice (imported) | KG | actual | Wholesale | NGN | 358.7 | 3.1656 |
| 3 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | cereals and tubers | Sorghum | KG | actual | Wholesale | NGN | 155.61 | 1.3733 |
| 4 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08 | 7.24 | pulses and nuts | Beans (niebe) | KG | actual | Wholesale | NGN | 196.87 | 1.7374 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 80976 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Onions | 400 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 80977 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Oranges | 400 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 80978 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Spinach | 300 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 80979 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Tomatoes | 0.5 KG | forecast | Retail | NGN | 0.0 | 0.0 |
| 80980 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062664985656738 | 11.171370506286621 | vegetables and fruits | Watermelons | 2.1 KG | forecast | Retail | NGN | 0.0 | 0.0 |
80981 rows × 14 columns
# Give every column a capitalised, human-readable title.
column_titles = {
    'date': 'Date',
    'admin1': 'Admin1',
    'admin2': 'Admin2',
    'market': 'Market',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'category': 'Category',
    'commodity': 'Commodity',
    'unit': 'Unit',
    'priceflag': 'Price Flag',
    'pricetype': 'Price Type',
    'currency': 'Currency',
    'price': 'Price',
    'usdprice': 'USD Price',
}
# Rename in place so `data` keeps referring to the same frame.
data.rename(column_titles, axis='columns', inplace=True)
data.columns
Index(['Date', 'Admin1', 'Admin2', 'Market', 'Latitude', 'Longitude',
'Category', 'Commodity', 'Unit', 'Price Flag', 'Price Type', 'Currency',
'Price', 'USD Price'],
dtype='object')
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 80981 entries, 0 to 80980 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 80981 non-null object 1 Admin1 80981 non-null object 2 Admin2 80981 non-null object 3 Market 80981 non-null object 4 Latitude 80981 non-null object 5 Longitude 80981 non-null object 6 Category 80981 non-null object 7 Commodity 80981 non-null object 8 Unit 80981 non-null object 9 Price Flag 80981 non-null object 10 Price Type 80981 non-null object 11 Currency 80981 non-null object 12 Price 80981 non-null object 13 USD Price 80981 non-null object dtypes: object(14) memory usage: 8.6+ MB
# Every column was read as `object` (strings); cast the date and the numeric
# columns to proper dtypes.
data['Date'] = data['Date'].astype('datetime64[ns]')
for numeric_col in ('Latitude', 'Longitude', 'Price', 'USD Price'):
    data[numeric_col] = data[numeric_col].astype(float)
data
| Date | Admin1 | Admin2 | Market | Latitude | Longitude | Category | Commodity | Unit | Price Flag | Price Type | Currency | Price | USD Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.080000 | 7.240000 | cereals and tubers | Maize | KG | actual | Wholesale | NGN | 175.92 | 1.5525 |
| 1 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.080000 | 7.240000 | cereals and tubers | Millet | KG | actual | Wholesale | NGN | 150.18 | 1.3254 |
| 2 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.080000 | 7.240000 | cereals and tubers | Rice (imported) | KG | actual | Wholesale | NGN | 358.70 | 3.1656 |
| 3 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.080000 | 7.240000 | cereals and tubers | Sorghum | KG | actual | Wholesale | NGN | 155.61 | 1.3733 |
| 4 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.080000 | 7.240000 | pulses and nuts | Beans (niebe) | KG | actual | Wholesale | NGN | 196.87 | 1.7374 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 80976 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Onions | 400 G | forecast | Retail | NGN | 0.00 | 0.0000 |
| 80977 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Oranges | 400 G | forecast | Retail | NGN | 0.00 | 0.0000 |
| 80978 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Spinach | 300 G | forecast | Retail | NGN | 0.00 | 0.0000 |
| 80979 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Tomatoes | 0.5 KG | forecast | Retail | NGN | 0.00 | 0.0000 |
| 80980 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Watermelons | 2.1 KG | forecast | Retail | NGN | 0.00 | 0.0000 |
80981 rows × 14 columns
# Print the distinct values of every column except the date and numeric ones.
excluded = ['Date', 'Latitude', 'Longitude', 'Price', 'USD Price']
for column in data.columns:
    if column in excluded:
        continue
    print("-------------------------------------------------",column,"---------------------------------------------------")
    print(data[column].unique())
    print("--------------------------------------------------------------------------------------------------------------")
------------------------------------------------- Admin1 --------------------------------------------------- ['Katsina' 'Sokoto' 'Borno' 'Kano' 'Jigawa' 'Oyo' 'Lagos' 'Kaduna' 'Zamfara' 'Abia' 'Gombe' 'Kebbi' 'Adamawa' 'Yobe'] -------------------------------------------------------------------------------------------------------------- ------------------------------------------------- Admin2 --------------------------------------------------- ['Jibia' 'Gada' "Mai'Adua" 'Mobbar' 'Dawakin Tofa' 'Maigatari' 'Ibadan North' 'Maiduguri' 'Kosofe' 'Giwa' 'Kaura Namoda' 'Oboma Ngwa' 'Akko' 'Kaugama' 'Lere' 'Dandume' 'Gwandu' 'Hong' 'Biu' 'Damaturu' 'Potiskum' 'Konduga' 'Geidam' 'Gujba' 'Jakusko' 'Karasuwa' 'Nguru' 'Yunusari' 'Yusufari' 'Borsari' 'Gulani'] -------------------------------------------------------------------------------------------------------------- ------------------------------------------------- Market --------------------------------------------------- ['Jibia (CBM)' 'Illela (CBM)' 'Mai Adoua (CBM)' 'Damassack (CBM)' 'Dawanau' 'Mai Gatari (CBM)' 'Ibadan' 'Maiduguri' 'Lagos' 'Giwa' 'Kaura Namoda' 'Aba' 'Gombe' 'Gujungu' 'Saminaka' 'Dandume' 'Gwandu' 'Mubi' 'Biu' 'Damaturu' 'Potiskum' 'Abba Gamaram' 'Baga Road' 'Bullunkutu' 'Budum' 'Custom' 'Kusawam Shanu' 'Monday' 'Tashan Bama' 'Bolori Stores' 'Damaturu (Sunday Market)' 'Geidam' 'Gujba (Buni Yadi)' 'Jakusko' 'Bade (Gashua)' 'Nguru' 'Yunusari' 'Yusufari' 'Bursari' 'Gulani (Tettaba)'] -------------------------------------------------------------------------------------------------------------- ------------------------------------------------- Category --------------------------------------------------- ['cereals and tubers' 'pulses and nuts' 'non-food' 'oil and fats' 'meat, fish and eggs' 'milk and dairy' 'vegetables and fruits' 'miscellaneous food'] -------------------------------------------------------------------------------------------------------------- ------------------------------------------------- 
Commodity --------------------------------------------------- ['Maize' 'Millet' 'Rice (imported)' 'Sorghum' 'Beans (niebe)' 'Wheat' 'Maize (white)' 'Sorghum (white)' 'Rice (milled, local)' 'Bread' 'Cassava meal (gari, yellow)' 'Gari (white)' 'Maize (yellow)' 'Rice (local)' 'Sorghum (brown)' 'Yam (Abuja)' 'Fuel (diesel)' 'Fuel (petrol-gasoline)' 'Oil (palm)' 'Cowpeas (brown)' 'Cowpeas (white)' 'Yam' 'Groundnuts (shelled)' 'Maize flour' 'Meat (beef)' 'Meat (goat)' 'Milk' 'Oil (vegetable)' 'Beans (red)' 'Beans (white)' 'Groundnuts' 'Onions' 'Fish' 'Eggs' 'Bananas' 'Oranges' 'Spinach' 'Watermelons' 'Cowpeas' 'Tomatoes' 'Salt' 'Sugar'] -------------------------------------------------------------------------------------------------------------- ------------------------------------------------- Unit --------------------------------------------------- ['KG' '100 KG' '50 KG' 'Unit' 'L' '100 L' '1.3 KG' '1.4 KG' '3.4 KG' '20 G' '750 ML' '1.1 KG' '0.5 KG' '1.5 KG' '30 pcs' '400 G' '300 G' '2.1 KG' '3.1 KG' '1.2 KG' '250 G' '100 Tubers'] -------------------------------------------------------------------------------------------------------------- ------------------------------------------------- Price Flag --------------------------------------------------- ['actual' 'actual,aggregate' 'aggregate' 'forecast'] -------------------------------------------------------------------------------------------------------------- ------------------------------------------------- Price Type --------------------------------------------------- ['Wholesale' 'Retail'] -------------------------------------------------------------------------------------------------------------- ------------------------------------------------- Currency --------------------------------------------------- ['NGN'] --------------------------------------------------------------------------------------------------------------
# Count the missing entries in each column.
missing_mask = data.isna()
missing_mask.sum()
Date 0 Admin1 0 Admin2 0 Market 0 Latitude 0 Longitude 0 Category 0 Commodity 0 Unit 0 Price Flag 0 Price Type 0 Currency 0 Price 0 USD Price 0 dtype: int64
# Set the rows whose Price Flag is 'forecast' aside in their own frame,
# renumbered from 0.
is_forecast = data['Price Flag'].eq('forecast')
test_data = data.loc[is_forecast].reset_index(drop=True)
test_data
| Date | Admin1 | Admin2 | Market | Latitude | Longitude | Category | Commodity | Unit | Price Flag | Price Type | Currency | Price | USD Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2006-01-15 | Sokoto | Gada | Illela (CBM) | 13.645000 | 5.278000 | cereals and tubers | Wheat | KG | forecast | Wholesale | NGN | 0.0 | 0.0 |
| 1 | 2006-02-15 | Sokoto | Gada | Illela (CBM) | 13.645000 | 5.278000 | cereals and tubers | Wheat | KG | forecast | Wholesale | NGN | 0.0 | 0.0 |
| 2 | 2006-03-15 | Katsina | Mai'Adua | Mai Adoua (CBM) | 13.180000 | 8.230000 | cereals and tubers | Wheat | KG | forecast | Wholesale | NGN | 0.0 | 0.0 |
| 3 | 2006-03-15 | Sokoto | Gada | Illela (CBM) | 13.645000 | 5.278000 | cereals and tubers | Wheat | KG | forecast | Wholesale | NGN | 0.0 | 0.0 |
| 4 | 2006-04-15 | Katsina | Mai'Adua | Mai Adoua (CBM) | 13.180000 | 8.230000 | cereals and tubers | Wheat | KG | forecast | Wholesale | NGN | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13944 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Onions | 400 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 13945 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Oranges | 400 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 13946 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Spinach | 300 G | forecast | Retail | NGN | 0.0 | 0.0 |
| 13947 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Tomatoes | 0.5 KG | forecast | Retail | NGN | 0.0 | 0.0 |
| 13948 | 2023-12-15 | Yobe | Yusufari | Yusufari | 13.062665 | 11.171371 | vegetables and fruits | Watermelons | 2.1 KG | forecast | Retail | NGN | 0.0 | 0.0 |
13949 rows × 14 columns
# Keep only the rows whose Price Flag is not 'forecast' and renumber them
# from 0.
not_forecast = data['Price Flag'].ne('forecast')
data = data.loc[not_forecast].reset_index(drop=True)
data
| Date | Admin1 | Admin2 | Market | Latitude | Longitude | Category | Commodity | Unit | Price Flag | Price Type | Currency | Price | USD Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08000 | 7.24000 | cereals and tubers | Maize | KG | actual | Wholesale | NGN | 175.92 | 1.5525 |
| 1 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08000 | 7.24000 | cereals and tubers | Millet | KG | actual | Wholesale | NGN | 150.18 | 1.3254 |
| 2 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08000 | 7.24000 | cereals and tubers | Rice (imported) | KG | actual | Wholesale | NGN | 358.70 | 3.1656 |
| 3 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08000 | 7.24000 | cereals and tubers | Sorghum | KG | actual | Wholesale | NGN | 155.61 | 1.3733 |
| 4 | 2002-01-15 | Katsina | Jibia | Jibia (CBM) | 13.08000 | 7.24000 | pulses and nuts | Beans (niebe) | KG | actual | Wholesale | NGN | 196.87 | 1.7374 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 67027 | 2023-01-15 | Zamfara | Kaura Namoda | Kaura Namoda | 12.59519 | 6.58635 | pulses and nuts | Cowpeas (brown) | KG | aggregate | Retail | NGN | 298.55 | 0.6480 |
| 67028 | 2023-01-15 | Zamfara | Kaura Namoda | Kaura Namoda | 12.59519 | 6.58635 | pulses and nuts | Cowpeas (white) | 100 KG | aggregate | Wholesale | NGN | 29440.00 | 63.8992 |
| 67029 | 2023-01-15 | Zamfara | Kaura Namoda | Kaura Namoda | 12.59519 | 6.58635 | pulses and nuts | Cowpeas (white) | KG | aggregate | Retail | NGN | 274.82 | 0.5965 |
| 67030 | 2023-01-15 | Zamfara | Kaura Namoda | Kaura Namoda | 12.59519 | 6.58635 | pulses and nuts | Groundnuts (shelled) | 100 KG | aggregate | Wholesale | NGN | 46960.00 | 101.9261 |
| 67031 | 2023-01-15 | Zamfara | Kaura Namoda | Kaura Namoda | 12.59519 | 6.58635 | pulses and nuts | Groundnuts (shelled) | KG | aggregate | Retail | NGN | 467.66 | 1.0151 |
67032 rows × 14 columns
Encoding categorical data is a crucial step in preparing data for machine learning models, as many algorithms require numerical input. Categorical data represents variables that can take on a limited, and usually fixed, number of values. Two common techniques for encoding categorical data are label encoding, which maps each category to an integer, and one-hot encoding, which creates one binary column per category.
# Bar chart (Plotly Express, not Seaborn) of USD Price per Category, with one
# colour per category. px.bar draws one mark per row of `data`, so each bar is
# the stack of all rows belonging to that category.
# The title now describes the chart; the previous one was unrelated to it.
fig = px.bar(data, y='USD Price', x='Category', color="Category", title="USD Price by Category")
fig.show()